In [1]:
from preamble import *
%matplotlib notebook

In [2]:
mglearn.plots.plot_logistic_regression_graph()


Out[2]:
[Graph: logistic regression as a computation graph — inputs x[0]..x[3] each connect directly to the output y via weights w[0]..w[3].]

In [3]:
mglearn.plots.plot_single_hidden_layer_graph()


Out[3]:
[Graph: a neural network with one hidden layer — inputs x[0]..x[3] connect to hidden units h[0]..h[2], which in turn connect to the output y.]
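For reference, the two graphs above correspond to simple formulas: logistic regression computes the output directly as a weighted sum of the inputs, while the one-hidden-layer network first computes intermediate units h[0]..h[2] with a nonlinear activation and then takes a weighted sum of those. A minimal NumPy sketch of the hidden-layer forward pass (the weight names w, b, v, bias_out and the tanh activation are illustrative assumptions, not taken from the cells above):

import numpy as np

def forward_one_hidden_layer(x, w, b, v, bias_out):
    # x: input vector of shape (4,)
    # w: hidden-layer weights, shape (3, 4); b: hidden-layer biases, shape (3,)
    # v: output weights, shape (3,); bias_out: scalar output bias
    h = np.tanh(w @ x + b)    # hidden units h[0]..h[2]
    return v @ h + bias_out   # output y (before any final nonlinearity)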

In [4]:
line = np.linspace(-3, 3, 100)
plt.figure()
plt.plot(line, np.tanh(line), label="tanh")
plt.plot(line, np.maximum(line, 0), label="relu")
plt.legend(loc="best")
plt.title("activation_functions")


Out[4]:
<matplotlib.text.Text at 0x7fa0a2738668>

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)


mlp = MLPClassifier(solver='lbfgs', random_state=0).fit(X_train, y_train)

plt.figure()
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=60, cmap=mglearn.cm2)


Out[10]:
<matplotlib.collections.PathCollection at 0x7f13089d2828>
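By default MLPClassifier uses a single hidden layer with 100 units. The decision boundary above can be made simpler or more complex by changing hidden_layer_sizes and the activation function; the values below are just one variation to try, not the settings used in the cell above:

# smaller network: two hidden layers with 10 units each, tanh activation
mlp = MLPClassifier(solver='lbfgs', activation='tanh',
                    hidden_layer_sizes=[10, 10], random_state=0)
mlp.fit(X_train, y_train)

plt.figure()
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=60, cmap=mglearn.cm2)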

Exercise

Compare MLPRegressor to linear methods on the Boston Housing and bike rental datasets. Try varying the number of hidden layers and the number of nodes per hidden layer. Compare the 'adam' and 'lbfgs' solvers in terms of outcome and training time.

How do the results differ with and without scaling the data?
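One possible starting point for the exercise is sketched below. It compares Ridge with MLPRegressor, with and without StandardScaler. Note that load_boston has been removed from recent scikit-learn releases, so this sketch uses the California housing data as a stand-in; swap in the Boston or bike data if you have them available.

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# stand-in regression dataset (replace with Boston / bike data as appropriate)
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

models = {
    "ridge": Ridge(),
    "mlp": MLPRegressor(solver='lbfgs', hidden_layer_sizes=[100],
                        random_state=0, max_iter=1000),
    "mlp scaled": make_pipeline(
        StandardScaler(),
        MLPRegressor(solver='lbfgs', hidden_layer_sizes=[100],
                     random_state=0, max_iter=1000)),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name, model.score(X_test, y_test))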